emanuela.damieto@studenti.unitn.it
This section and the next are relevant for reproducibility purposes. For results, please skip to section 3 (Quality Control)
suppressPackageStartupMessages({
library(data.table)
library(DESeq2)
library(gplots)
library(here)
library(hyperSpec)
library(magrittr)
library(parallel)
library(pheatmap)
library(plotly)
library(pvclust)
library(tidyverse)
library(tximport)
library(vsn)
library(emoji)
})source(here("UPSCb-common/src/R/featureSelection.R"))hpal <- colorRampPalette(c("blue","white","red"))(100)samples <- read_csv(here("data/drought_roots.csv"),
col_types=cols(.default=col_factor()))tx2gene <- suppressMessages(read_delim(here("reference/annotation/Picab02_tx2gene.tsv.gz"),
delim="\t", col_names=c("TXID","GENE"), skip=1))filelist <- list.files(here("results/Salmon"),
recursive = TRUE,
pattern = "quant.sf",
full.names = TRUE)stopifnot(all(match(sub("[1-3]_151118_BC852HANXX_P2503_", "", sub("*_sortmerna_trimmomatic","",basename(dirname(filelist)))),
samples$SciLifeID) == 1:nrow(samples)))
samples_rep <-rbind(samples,samples)
samples_rep <- rbind(samples_rep,samples)samples_rep %<>% mutate(Filenames = filelist)write_tsv(samples_rep,here("data/samples_full_rank.txt"))txi <- suppressMessages(tximport(files = samples_rep$Filenames,
type = "salmon",
tx2gene=tx2gene))
counts <- txi$counts
samples_rep$Filenames <- sub("*_151118_BC852HANXX_P2503_", "_", sub("*_sortmerna_trimmomatic","",basename(dirname(samples_rep$Filenames))))
colnames(counts) <- samples_rep$Filenames
sel <- rowSums(counts) == 0
sprintf("%s%% percent (%s) of %s genes are not expressed",
round(sum(sel) * 100/ nrow(counts),digits=1),
sum(sel),
nrow(counts))## [1] "8.3% percent (3627) of 43909 genes are not expressed"
dat <- tibble(x=colnames(counts),y=colSums(counts)) %>%
bind_cols(samples_rep)
ggplot(dat,aes(x,y,fill=Level)) +
geom_col() +
scale_y_continuous(name="reads") +
facet_grid(~ factor(Level), scales = "free") +
theme_bw() +
theme(axis.text.x=element_text(angle=90,size=4), axis.title.x=element_blank()) +
labs(title ="Sample sequencing depth")👉 We observe some variation in the raw sequencing depth between conditions but it’s not so significant
i.e. the mean raw count of every gene across samples is calculated and displayed on a log10 scale.
ggplot(data.frame(value=log10(rowMeans(counts))),aes(x=value)) +
geom_density(na.rm = TRUE) +
ggtitle("Gene mean raw counts distribution") +
scale_x_continuous(name="mean raw counts (log10)") +
theme_bw()👉 The gene mean raw counts distribution is as expected since the highest peak is around 2
dat <- as.data.frame(log10(counts)) %>%
utils::stack() %>%
mutate(Level=samples_rep$Level[match(ind,samples_rep$Filenames)])
ggplot(dat,aes(x=values,group=ind,col=Level)) +
geom_density(na.rm = TRUE) +
ggtitle("Sample raw counts distribution") +
scale_x_continuous(name="per gene raw counts (log10)") +
theme_bw()👉 All samples have the same sequencing depth characteristics and there is no deviation when we look at one or the other condition
dir.create(here("data/analysis/salmon"),showWarnings=FALSE,recursive=TRUE)
write.csv(counts,file=here("data/analysis/salmon/raw-unormalised-gene-expression_data.csv"))
For visualization, the data is submitted to a variance stabilization transformation using DESeq2. The dispersion is estimated independently of the sample condition and replicate.
dds <- DESeqDataSetFromTximport(
txi=txi,
colData = samples_rep,
design = ~ Level)## Note: levels of factors in the design contain characters other than
## letters, numbers, '_' and '.'. It is recommended (but not required) to use
## only letters, numbers, and delimiters '_' or '.', as these are safe characters
## for column names in R. [This is a message, not a warning or an error]
## using counts and average transcript lengths from tximport
colnames(dds) <- samples_rep$Filenames
save(dds,file=here("data/analysis/salmon/dds.rda"))(i.e. the sequencing library size effect)
dds <- estimateSizeFactors(dds) %>%
suppressMessages()boxplot of the sequencing libraries size factor:
boxplot(normalizationFactors(dds),
main="Sequencing libraries size factor",
las=2,log="y")
abline(h=1, col = "Red", lty = 3)and without outliers:
boxplot(normalizationFactors(dds),
main="Sequencing libraries size factor",
las=2,log="y", outline=FALSE)
abline(h=1, col = "Red", lty = 3)👉 The sequencing libraries size factor is highly variable across samples but similar for the technical replicates
Let’s look at standard deviations before and after VST normalization. This plot is to see whether there is a dependency of SD on the mean.
Before:
meanSdPlot(log1p(counts(dds))[rowSums(counts(dds))>0,]) After VST normalization, the red line should be almost horizontal which indicates no dependency of variance on mean (homoscedastic).
vsd <- varianceStabilizingTransformation(dds, blind=TRUE)
vst <- assay(vsd)
vst <- vst - min(vst)After:
meanSdPlot(vst[rowSums(vst)>0,]) 👉 We can conclude that the variance stabilization worked adequately even if the red line is not perfectly horizontal
pc <- prcomp(t(vst))
percent <- round(summary(pc)$importance[2,]*100)We define the number of variables of the model (=1) and the number of possible combinations (=8).
We plot the percentage explained by different components
nvar=1
nlevel=nlevels(dds$Level)
ggplot(tibble(x=1:length(percent),y=cumsum(percent)),aes(x=x,y=y)) +
geom_line() + scale_y_continuous("variance explained (%)",limits=c(0,100)) +
scale_x_continuous("Principal component") +
geom_vline(xintercept=nvar,colour="red",linetype="dashed",size=0.5) +
geom_hline(yintercept=cumsum(percent)[nvar],colour="red",linetype="dashed",size=0.5) +
geom_vline(xintercept=nlevel,colour="orange",linetype="dashed",size=0.5) +
geom_hline(yintercept=cumsum(percent)[nlevel],colour="orange",linetype="dashed",size=0.5)+
ggtitle("Percentage of variance explained by different components")## Warning: Using `size` aesthetic for lines was deprecated in ggplot2 3.4.0.
## ℹ Please use `linewidth` instead.
pc.dat <- bind_cols(PC1=pc$x[,1],
PC2=pc$x[,2],
as.data.frame(colData(dds)))
p <- ggplot(pc.dat,aes(x=PC1,y=PC2,col=Level,shape=Level,text=Filenames)) +
geom_point(size=2) +
ggtitle("Principal Component Analysis",subtitle="variance stabilized counts")
ggplotly(p) %>%
layout(xaxis=list(title=paste("PC1 (",percent[1],"%)",sep="")),
yaxis=list(title=paste("PC2 (",percent[2],"%)",sep="")))## Warning: The shape palette can deal with a maximum of 6 discrete values because
## more than 6 becomes difficult to discriminate; you have 8. Consider
## specifying shapes manually if you must have them.
👉 Some conditions clusterized better than others in the PCA plot
Number of genes expressed per condition at different cutoffs:
conds <- factor(dds$Level)dev.null <- rangeSamplesSummary(counts=vst,
conditions=conds,
nrep=3)Filter for noise
sels <- rangeFeatureSelect(counts=vst,
conditions=conds,
nrep=3) %>%
suppressWarnings()vst.cutoff <- 2
abline(h=10000, col="Red", lty=3)vst.cutoff <- 5
hm_reduced <- heatmap.2(t(scale(t(vst[sels[[vst.cutoff+1]],]))),
distfun=pearson.dist,
hclustfun=function(X){hclust(X,method="ward.D2")},
labRow = NA,trace = "none",
labCol = conds,
col=hpal)plot(as.hclust(hm_reduced$colDendrogram),xlab="",sub="", cex.axis=2)mat <- t(scale(t(vst[sels[[vst.cutoff+1]],])))
hm <- pheatmap(mat,
color = hpal,
border_color = NA,
clustering_distance_rows = "correlation",
clustering_method = "ward.D2",
show_rownames = F,
labels_col = conds,
angle_col = 90,
legend = F)plot(as.hclust(hm_reduced$colDendrogram),xlab="",sub="", cex.axis=2)👉 The different conditions are not so different in gene expression level
Done to assess the previous dendrogram’s reproducibility
hm.pvclust <- pvclust(data = t(scale(t(vst[sels[[vst.cutoff+1]],]))),
method.hclust = "ward.D2",
nboot = 1000, parallel = TRUE)## Creating a temporary cluster...done:
## socket cluster with 15 nodes on host 'localhost'
## Multiscale bootstrap... Done.
Plot the clustering with bp and au
plot(hm.pvclust, labels = conds, cex.axis=1.5)
pvrect(hm.pvclust)👉 Some conditions clusterize better than others
print(hm.pvclust, digits=3)##
## Cluster method: ward.D2
## Distance : correlation
##
## Estimates on edges:
##
## si au bp se.si se.au se.bp v c pchi
## 1 0.622 0.832 0.754 0.034 0.021 0.005 -0.824 0.138 0.379
## 2 1.000 1.000 1.000 0.000 0.000 0.000 0.000 0.000 0.000
## 3 0.611 0.813 0.785 0.035 0.023 0.004 -0.840 0.050 0.540
## 4 1.000 1.000 1.000 0.000 0.000 0.000 0.000 0.000 0.000
## 5 0.133 0.640 0.472 0.040 0.029 0.005 -0.144 0.215 0.707
## 6 1.000 1.000 1.000 0.000 0.000 0.000 0.000 0.000 0.000
## 7 0.979 0.991 0.975 0.007 0.004 0.002 -2.167 0.207 0.426
## 8 1.000 1.000 1.000 0.000 0.000 0.000 0.000 0.000 0.000
## 9 0.200 0.643 0.538 0.039 0.029 0.005 -0.231 0.135 0.484
## 10 1.000 1.000 1.000 0.000 0.000 0.000 0.000 0.000 0.000
## 11 0.813 0.898 0.933 0.030 0.020 0.003 -1.385 -0.114 0.595
## 12 0.573 0.794 0.769 0.036 0.024 0.004 -0.778 0.042 0.356
## 13 1.000 1.000 1.000 0.000 0.000 0.000 0.000 0.000 0.000
## 14 1.000 1.000 1.000 0.000 0.000 0.000 0.000 0.000 0.000
## 15 0.590 0.826 0.713 0.035 0.021 0.005 -0.749 0.188 0.405
## 16 1.000 1.000 1.000 0.000 0.000 0.000 0.000 0.000 0.000
## 17 0.399 0.692 0.714 0.038 0.028 0.005 -0.534 -0.032 0.300
## 18 0.197 0.651 0.522 0.039 0.028 0.005 -0.223 0.166 0.531
## 19 1.000 1.000 1.000 0.000 0.000 0.000 0.000 0.000 0.000
## 20 0.896 0.957 0.900 0.018 0.009 0.003 -1.498 0.218 0.590
## 21 0.550 0.810 0.688 0.036 0.022 0.005 -0.683 0.194 0.863
## 22 0.402 0.731 0.643 0.038 0.026 0.005 -0.490 0.124 0.485
## 23 1.000 1.000 1.000 0.000 0.000 0.000 0.000 0.000 0.000
## 24 1.000 1.000 1.000 0.000 0.000 0.000 0.000 0.000 0.000
## 25 1.000 1.000 1.000 0.000 0.000 0.000 0.000 0.000 0.000
## 26 1.000 1.000 1.000 0.000 0.000 0.000 0.000 0.000 0.000
## 27 0.026 0.535 0.490 0.035 0.031 0.005 -0.031 0.056 0.138
## 28 1.000 1.000 1.000 0.000 0.000 0.000 0.000 0.000 0.000
## 29 0.015 0.582 0.431 0.038 0.030 0.005 -0.016 0.191 0.213
## 30 0.948 0.977 0.956 0.013 0.007 0.002 -1.850 0.142 0.657
## 31 1.000 1.000 1.000 0.000 0.000 0.000 0.000 0.000 0.000
## 32 0.148 0.631 0.499 0.039 0.029 0.005 -0.166 0.168 0.229
## 33 1.000 1.000 1.000 0.000 0.000 0.000 0.000 0.000 0.000
## 34 1.000 1.000 1.000 0.000 0.000 0.000 0.000 0.000 0.000
## 35 0.827 0.927 0.852 0.023 0.013 0.004 -1.252 0.205 0.984
## 36 1.000 1.000 1.000 0.000 0.000 0.000 0.000 0.000 0.000
## 37 0.095 0.605 0.479 0.038 0.030 0.005 -0.107 0.161 0.258
## 38 0.849 0.942 0.839 0.021 0.010 0.004 -1.280 0.290 0.673
## 39 1.000 1.000 1.000 0.000 0.000 0.000 0.000 0.000 0.000
## 40 1.000 1.000 1.000 0.000 0.000 0.000 0.000 0.000 0.000
## 41 0.633 0.818 0.811 0.035 0.023 0.004 -0.896 0.013 0.578
## 42 0.417 0.735 0.655 0.038 0.026 0.005 -0.514 0.114 0.352
## 43 0.000 0.551 0.419 0.000 0.031 0.005 0.038 0.167 0.705
## 44 1.000 1.000 1.000 0.000 0.000 0.000 0.000 0.000 0.000
## 45 0.014 0.516 0.498 0.034 0.031 0.005 -0.017 0.023 0.334
## 46 1.000 1.000 1.000 0.000 0.000 0.000 0.000 0.000 0.000
## 47 0.975 0.989 0.975 0.009 0.004 0.002 -2.131 0.167 0.669
## 48 1.000 1.000 1.000 0.000 0.000 0.000 0.000 0.000 0.000
## 49 1.000 1.000 1.000 0.000 0.000 0.000 0.000 0.000 0.000
## 50 0.334 0.703 0.604 0.039 0.027 0.005 -0.397 0.135 0.654
## 51 1.000 1.000 1.000 0.000 0.000 0.000 0.000 0.000 0.000
## 52 1.000 1.000 1.000 0.000 0.000 0.000 0.000 0.000 0.000
## 53 0.120 0.574 0.542 0.036 0.030 0.005 -0.146 0.041 0.998
## 54 0.310 0.707 0.565 0.039 0.027 0.005 -0.354 0.191 0.918
## 55 1.000 1.000 1.000 0.000 0.000 0.000 0.000 0.000 0.000
## 56 1.000 1.000 1.000 0.000 0.000 0.000 0.000 0.000 0.000
## 57 0.999 1.000 1.000 0.003 0.002 0.000 -3.463 -0.014 0.757
## 58 0.010 0.498 0.512 0.032 0.031 0.005 -0.013 -0.019 0.840
## 59 1.000 1.000 1.000 0.000 0.000 0.000 0.000 0.000 0.000
## 60 1.000 1.000 1.000 0.000 0.000 0.000 0.000 0.000 0.000
## 61 1.000 1.000 1.000 0.000 0.000 0.000 0.000 0.000 0.000
## 62 1.000 1.000 1.000 0.000 0.000 0.000 0.000 0.000 0.000
## 63 1.000 1.000 1.000 0.000 0.000 0.000 0.000 0.000 0.000
## 64 0.889 0.940 0.963 0.024 0.016 0.002 -1.669 -0.118 0.453
## 65 0.930 0.963 0.974 0.019 0.012 0.002 -1.862 -0.073 0.650
## 66 1.000 1.000 1.000 0.000 0.000 0.000 0.000 0.000 0.000
## 67 0.869 0.930 0.949 0.025 0.016 0.002 -1.558 -0.079 0.124
## 68 0.927 0.961 0.974 0.020 0.013 0.002 -1.849 -0.089 0.495
## 69 1.000 1.000 1.000 0.000 0.000 0.000 0.000 0.000 0.000
## 70 0.905 0.945 0.977 0.026 0.018 0.002 -1.797 -0.202 0.605
## 71 0.914 0.971 0.857 0.014 0.006 0.004 -1.480 0.414 0.567
## 72 0.405 0.806 0.485 0.041 0.021 0.005 -0.413 0.450 0.890
## 73 0.479 0.818 0.551 0.038 0.020 0.005 -0.519 0.390 0.381
## 74 0.000 0.747 0.164 0.000 0.031 0.004 0.156 0.822 0.042
## 75 0.000 0.552 0.354 0.000 0.031 0.005 0.121 0.252 0.349
## 76 1.000 1.000 1.000 0.000 0.000 0.000 0.000 0.000 0.000
## 77 0.284 0.701 0.544 0.040 0.027 0.005 -0.319 0.209 0.521
## 78 0.000 0.736 0.168 0.000 0.031 0.004 0.165 0.796 0.052
## 79 0.000 0.583 0.250 0.000 0.033 0.004 0.233 0.442 0.507
## 80 0.000 0.583 0.250 0.000 0.033 0.004 0.233 0.442 0.507
## 81 0.000 0.583 0.250 0.000 0.033 0.004 0.233 0.442 0.507
## 82 1.000 1.000 1.000 0.000 0.000 0.000 0.000 0.000 0.000
## 83 1.000 1.000 1.000 0.000 0.000 0.000 0.000 0.000 0.000
samples_rep$Filenames <- filelist
samples_rep$BioID <- sub("[1-3]_151118_BC852HANXX_", "", sub("*_sortmerna_trimmomatic","",basename(dirname(samples_rep$Filenames))))Merging technical replicates
txi$counts <- sapply(split.data.frame(t(txi$counts),samples_rep$BioID),colSums)
txi$length <- sapply(split.data.frame(t(txi$length),samples_rep$BioID),colMaxs)Counts are now in alphabetic order, check and reorder if necessary
stopifnot(colnames(txi$counts) == samples_rep$BioID)
samples_rep <- samples_rep[match(colnames(txi$counts),samples_rep$BioID),]Recreate the dds
dds <- DESeqDataSetFromTximport(
txi=txi,
colData = samples_rep,
design = ~ Level)## Note: levels of factors in the design contain characters other than
## letters, numbers, '_' and '.'. It is recommended (but not required) to use
## only letters, numbers, and delimiters '_' or '.', as these are safe characters
## for column names in R. [This is a message, not a warning or an error]
## using counts and average transcript lengths from tximport
save(dds,file=here("data/analysis/salmon/dds_merge.rda"))
⭐ The data are quite good so we can continue with our analysis
txi <- suppressMessages(tximport(files = samples_rep$Filenames,
type = "salmon",
tx2gene=tx2gene,
countsFromAbundance="lengthScaledTPM"))
counts <- txi$counts
samples_rep$Filenames <- sub("*_151118_BC852HANXX_P2503_", "_", sub("*_sortmerna_trimmomatic","",basename(dirname(samples_rep$Filenames))))
colnames(counts) <- samples_rep$Filenames
dds <- DESeqDataSetFromTximport(
txi=txi,
colData = samples_rep,
design = ~ Level)## Note: levels of factors in the design contain characters other than
## letters, numbers, '_' and '.'. It is recommended (but not required) to use
## only letters, numbers, and delimiters '_' or '.', as these are safe characters
## for column names in R. [This is a message, not a warning or an error]
## using just counts from tximport
colnames(dds) <- samples_rep$Filenames
save(dds,file=here("data/analysis/salmon/dds_lengthScaledTPM.rda"))sessionInfo()## R version 4.2.1 (2022-06-23)
## Platform: x86_64-pc-linux-gnu (64-bit)
## Running under: Ubuntu 18.04.6 LTS
##
## Matrix products: default
## BLAS/LAPACK: /usr/lib/x86_64-linux-gnu/libopenblasp-r0.2.20.so
##
## locale:
## [1] LC_CTYPE=en_US.UTF-8 LC_NUMERIC=C
## [3] LC_TIME=en_US.UTF-8 LC_COLLATE=en_US.UTF-8
## [5] LC_MONETARY=en_US.UTF-8 LC_MESSAGES=en_US.UTF-8
## [7] LC_PAPER=en_US.UTF-8 LC_NAME=C
## [9] LC_ADDRESS=C LC_TELEPHONE=C
## [11] LC_MEASUREMENT=en_US.UTF-8 LC_IDENTIFICATION=C
##
## attached base packages:
## [1] parallel grid stats4 stats graphics grDevices utils
## [8] datasets methods base
##
## other attached packages:
## [1] emoji_15.0 vsn_3.64.0
## [3] tximport_1.24.0 forcats_0.5.2
## [5] stringr_1.5.0 dplyr_1.0.10
## [7] purrr_1.0.0 readr_2.1.3
## [9] tidyr_1.2.1 tibble_3.1.8
## [11] tidyverse_1.3.2 pvclust_2.2-0
## [13] plotly_4.10.1 pheatmap_1.0.12
## [15] magrittr_2.0.3 hyperSpec_0.100.0
## [17] xml2_1.3.3 ggplot2_3.4.0
## [19] lattice_0.20-45 here_1.0.1
## [21] gplots_3.1.3 DESeq2_1.36.0
## [23] SummarizedExperiment_1.28.0 Biobase_2.58.0
## [25] MatrixGenerics_1.10.0 matrixStats_0.63.0
## [27] GenomicRanges_1.50.1 GenomeInfoDb_1.34.4
## [29] IRanges_2.32.0 S4Vectors_0.36.1
## [31] BiocGenerics_0.44.0 data.table_1.14.6
##
## loaded via a namespace (and not attached):
## [1] readxl_1.4.1 backports_1.4.1 lazyeval_0.2.2
## [4] splines_4.2.0 crosstalk_1.2.0 BiocParallel_1.32.4
## [7] digest_0.6.31 htmltools_0.5.4 fansi_1.0.3
## [10] memoise_2.0.1 googlesheets4_1.0.1 limma_3.52.4
## [13] tzdb_0.3.0 Biostrings_2.66.0 annotate_1.74.0
## [16] modelr_0.1.10 vroom_1.6.0 timechange_0.1.1
## [19] jpeg_0.1-10 colorspace_2.0-3 blob_1.2.3
## [22] rvest_1.0.3 haven_2.5.1 xfun_0.35
## [25] hexbin_1.28.2 crayon_1.5.2 RCurl_1.98-1.9
## [28] jsonlite_1.8.4 genefilter_1.78.0 survival_3.4-0
## [31] glue_1.6.2 gtable_0.3.1 gargle_1.2.1
## [34] zlibbioc_1.44.0 XVector_0.38.0 DelayedArray_0.24.0
## [37] scales_1.2.1 DBI_1.1.3 Rcpp_1.0.9
## [40] viridisLite_0.4.1 xtable_1.8-4 bit_4.0.5
## [43] preprocessCore_1.58.0 htmlwidgets_1.6.1 httr_1.4.4
## [46] RColorBrewer_1.1-3 ellipsis_0.3.2 farver_2.1.1
## [49] pkgconfig_2.0.3 XML_3.99-0.13 sass_0.4.4
## [52] dbplyr_2.2.1 deldir_1.0-6 locfit_1.5-9.7
## [55] utf8_1.2.2 labeling_0.4.2 tidyselect_1.2.0
## [58] rlang_1.0.6 AnnotationDbi_1.60.0 munsell_0.5.0
## [61] cellranger_1.1.0 tools_4.2.0 cachem_1.0.6
## [64] cli_3.5.0 generics_0.1.3 RSQLite_2.2.20
## [67] broom_1.0.2 evaluate_0.19 fastmap_1.1.0
## [70] yaml_2.3.6 knitr_1.41 bit64_4.0.5
## [73] fs_1.5.2 caTools_1.18.2 KEGGREST_1.38.0
## [76] brio_1.1.3 compiler_4.2.0 rstudioapi_0.14
## [79] png_0.1-8 testthat_3.1.6 affyio_1.66.0
## [82] reprex_2.0.2 geneplotter_1.74.0 bslib_0.4.2
## [85] stringi_1.7.8 highr_0.10 Matrix_1.5-3
## [88] vctrs_0.5.1 pillar_1.8.1 lifecycle_1.0.3
## [91] BiocManager_1.30.19 jquerylib_0.1.4 bitops_1.0-7
## [94] R6_2.5.1 latticeExtra_0.6-30 affy_1.74.0
## [97] KernSmooth_2.23-20 codetools_0.2-18 gtools_3.9.4
## [100] assertthat_0.2.1 rprojroot_2.0.3 withr_2.5.0
## [103] GenomeInfoDbData_1.2.9 hms_1.1.2 rmarkdown_2.19
## [106] googledrive_2.0.0 lubridate_1.9.0 interp_1.1-3